################################################################################
## setup
################################################################################
# clean slate
rm(list = ls())
date()

# load packages
pkg <- c("foreign",
         "tidyverse",
         "RColorBrewer", 
         "gridExtra",
         "tools", 
         "lfe", 
         "lme4",
         "lmerTest",
         "stargazer",
         "multcomp",
         "rms")
lapply(pkg, require, character.only = TRUE)

# set directory
MAIN_DIR <- "~/Dropbox/Research/Liao_Newman_Malhotra/replication/"

# set file directories
FIG_DIR <- paste(MAIN_DIR, "figures/", sep = "")
TABLE_DIR <- paste(MAIN_DIR, "tables/", sep = "")
RESULTS <- paste(MAIN_DIR, "results/", sep = "")

# create folder if doesn't exist
if (!dir.exists(FIG_DIR)) {
  print("Directory does not exist! Creating...")
  dir.create(FIG_DIR)
}
if (!dir.exists(TABLE_DIR)) {
  print("Directory does not exist! Creating...")
  dir.create(TABLE_DIR)
}
if (!dir.exists(RESULTS)) {
  print("Directory does not exist! Creating...")
  dir.create(RESULTS)
}

# first time running code?
first.time <- TRUE # fresh run of everything but simulations will take time
#first.time <- FALSE # saves time by loading pre-saved outputs

# load data
load(file = paste(MAIN_DIR, "pew-merge.RData", sep = ""))

# set data
df <- pew.merge %>% 
  filter(!is.na(zcta))


################################################################################
# set var and var name correspondence
################################################################################
var.df <- tibble(var = c("anticorrupt",
                         "log_china_undergraduate",
                         "log_china_graduate",
                         "log_india_undergraduate",
                         "anticorrupt:log_china_undergraduate",
                         "anticorrupt:log_china_graduate",
                         "anticorrupt:log_india_undergraduate",
                         
                         # individual-level controls
                         "age",
                         "education",
                         "white",
                         "income",
                         "pid5",
                         "ideology",
                         
                         # zcta controls
                         "median_household_income_zcta",
                         "anticorrupt:median_household_income_zcta",
                         
                         "ln_pop_density_zcta",
                         "anticorrupt:ln_pop_density_zcta",
                         
                         "share_enroll_zcta",
                         "anticorrupt:share_enroll_zcta"),
                 var_name = c("Anti-corruption Campaign (dummy)",
                              "Log Chinese Undergrads in ZIP Code",
                              "Log Chinese Grads in ZIP Code",
                              "Log Indian Undergrads in ZIP code",
                              "Log Chinese Undergrads in ZIP Code x Anti-corruption Campaign (dummy)",
                              "Log Chinese Grads in ZIP Code x Anti-corruption Campaign (dummy)",
                              "Log Indian Undergrads in ZIP code x Anti-corruption Campaign (dummy)",
                              
                              # individual-level controls
                              "Respondent's Age",
                              "Respondent's Highest Level of Education",
                              "Respondent's Race: White (dummy)",
                              "Respondent's Family Income Level",
                              "Respondent's Party Identification",
                              "Respondent's Political Ideology",
                              
                              # zcta controls
                              "Median Household Income (10,000)",
                              "Median Household Income (10,000) x Anti-corruption Campaign (dummy)",
                              
                              "Log Population Density",
                              "Log Population Density x Anti-corruption Campaign (dummy)",
                              
                              "Share Enrolled in College or Above",
                              "Share Enrolled in College or Above x Anti-corruption Campaign (dummy)"))
                             

# function to replace variable names
replaceVarName <- function(var.vec, var.df){
  # Prepare output vector
  out.vec <- rep(NA, length(var.vec))
  matches <- match(var.vec, var.df$var)
  out.vec <- var.df[matches,]$var_name
  
  if(any(is.na(out.vec))){
    warning(paste("Variable concordence missing: ", 
                  paste(var.vec[is.na(out.vec)], collapse = ", "), 
                  sep = ""))
  } else{
    #print("All variables successfully converted")
  }
  
  return(out.vec)
}


################################################################################
# Table S12. DiD Regression Results: Perceived China Threat
################################################################################
# Column (1)
did.cn.ols.b <- felm(cnthreat ~ anticorrupt * log_china_undergraduate 
                         | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                         data = df)
summary(did.cn.ols.b)

# Column (2)
did.cn.ols.i <- felm(cnthreat ~ anticorrupt * log_china_undergraduate +
                           age + education + white + income + pid5 + ideology
                         | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                         data = df)
summary(did.cn.ols.i)

# Column (3)
did.cn.ols.f <- felm(cnthreat ~ anticorrupt * log_china_undergraduate +
                          age + education + white + income + pid5 + ideology +
                          anticorrupt * median_household_income_zcta +
                          anticorrupt * ln_pop_density_zcta +
                          anticorrupt * share_enroll_zcta
                        | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                        keepX = TRUE,
                        data = df)
summary(did.cn.ols.f)

# marginal effects
summary(glht(did.cn.ols.f,
             linfct = c("log_china_undergraduate + anticorrupt:log_china_undergraduate = 0")))

# Column (4)
lme.cn <- lmer(cnthreat ~ anticorrupt * log_china_undergraduate +
                 age + education + white + income + pid5 + ideology +
                 anticorrupt * median_household_income_zcta +
                 anticorrupt * ln_pop_density_zcta +
                 anticorrupt * share_enroll_zcta +
                 (1 | zcta), 
               data = df)
summary(lme.cn)

# Column (5)
did.cn.ologit.f <- lrm(cnthreat ~ anticorrupt * log_china_undergraduate +
                         age + education + white + income + pid5 + ideology +
                         anticorrupt * median_household_income_zcta +
                         anticorrupt * ln_pop_density_zcta +
                         anticorrupt * share_enroll_zcta,
                         x = TRUE, y = TRUE,
                         data = df)
print(did.cn.ologit.f)

# clustered SE
did.cn.ologit.f.cse <- robcov(did.cn.ologit.f, df$zcta) #robust errors
print(did.cn.ologit.f.cse)

# make variable names consistent with other models
names(did.cn.ologit.f.cse$coefficients)[names(did.cn.ologit.f.cse$coefficients) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
names(did.cn.ologit.f.cse$coefficients)[names(did.cn.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.cn.ologit.f.cse$coefficients)[names(did.cn.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.cn.ologit.f.cse$coefficients)[names(did.cn.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.cn.ologit.f.cse$var)[rownames(did.cn.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
colnames(did.cn.ologit.f.cse$var)[colnames(did.cn.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
rownames(did.cn.ologit.f.cse$var)[rownames(did.cn.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.cn.ologit.f.cse$var)[colnames(did.cn.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.cn.ologit.f.cse$var)[rownames(did.cn.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.cn.ologit.f.cse$var)[colnames(did.cn.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.cn.ologit.f.cse$var)[rownames(did.cn.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.cn.ologit.f.cse$var)[colnames(did.cn.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"

# set outputs
did.main <- list(did.cn.ols.b,
                 did.cn.ols.i,
                 did.cn.ols.f,
                 lme.cn,
                 did.cn.ologit.f.cse)

# set p-values
vv.ologit.main <- diag(did.cn.ologit.f.cse$var)
cof.ologit.main <- did.cn.ologit.f.cse$coef
z.ologit.main <- cof.ologit.main/sqrt(vv.ologit.main)
p.ologit.main <- 1 - pchisq(z.ologit.main^2, 1)

pvalue.main <- list(round(did.cn.ols.b$cpval, 3),
                    round(did.cn.ols.i$cpval, 3),
                    round(did.cn.ols.f$cpval, 3),
                    round(summary(lme.cn)$coefficients[,"Pr(>|t|)"], 3),
                    round(p.ologit.main, 3))

# set CIs
lme.cn.ci <- confint(lme.cn)

ci.main <- list(confint(did.cn.ols.b), 
                confint(did.cn.ols.i),
                confint(did.cn.ols.f),
                lme.cn.ci,
                confint.default(did.cn.ologit.f.cse))

# set model names
mod.names.main <- c("OLS Baseline", 
                    "OLS Extended", 
                    "OLS Full",
                    "Linear Mixed Effects",
                    "Ordered Logit")

# set var order
var.order <- c("^anticorrupt$",
               "^log_china_undergraduate$",
               "^anticorrupt:log_china_undergraduate$",
               
               # individual-level controls
               "^age$",
               "^education$",
               "^white$",
               "^income$",
               "^pid5$",
               "^ideology$",
               
               # zcta controls
               "^median_household_income_zcta$",
               "^anticorrupt:median_household_income_zcta$",
               
               "^ln_pop_density_zcta$",
               "^anticorrupt:ln_pop_density_zcta$",
               
               "^share_enroll_zcta$",
               "^anticorrupt:share_enroll_zcta$",
               "^y>=2$",
               "^y>=3$")

# set var label
var.label <- str_replace_all(var.order, "\\^", "")
var.label <- str_replace_all(var.label, "\\$", "")

# save
# NOTE that a bug in stargazer causes it to use default p-values instead of 
# replacing them with the p-values we have calculated above. Results are 
# usually the same up to two decimal points. When there is a slight difference, 
# we report in the SI the p-values calculated above.
sink(file.path(TABLE_DIR, "Table-S12-pew-main.txt"))
#sink(file.path(TABLE_DIR, "Table-S12-pew-main.tex"))
stargazer(did.main,
          p = pvalue.main, # does not seem to work under report = ("vcsp"), corrected in SI
          p.auto = FALSE,
          t.auto = FALSE,
          ci = TRUE,
          ci.custom = ci.main,
          report = ("vcsp"),
          digits = 3, 
          #type = "latex",
          type = "text",
          title = "DiD Regression Results: Perceived China Threat",
          dep.var.labels = "Perceived China Threat",
          model.names = FALSE,
          column.labels = mod.names.main,
          order = var.order,
          covariate.labels = replaceVarName(var.vec = var.label,
                                            var.df = var.df),
          label = "tb:did-pew-main",
          single.row = FALSE,
          df = FALSE,
          font.size = "tiny",
          add.lines = list(c("Varying Intercepts: ZIP Code",
                             c("No", "No", "No", "Yes", "No")),
                           c("Number of ZIP Codes", 
                             formatC(length(levels(did.cn.ols.b$clustervar$zcta)), big.mark = ","),
                             formatC(length(levels(did.cn.ols.i$clustervar$zcta)), big.mark = ","),
                             formatC(length(levels(did.cn.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(length(levels(lme.cn@frame$zcta)), big.mark = ","),
                             formatC(did.cn.ologit.f.cse$clusterInfo$n, big.mark = ","))
                           ),
          star.cutoffs = c(0.1, 0.05, 0.01),
          star.char = c("*", "**", "***"),
          #notes = c("Robust standard errors clustered by ZIP code in parentheses. $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
          notes = c("95\\% confidence intervals and p-values are presented. Calculations are based on robust standard errors clustered by ZIP code."),
          notes.align = "l",
          notes.append = FALSE)
sink()


################################################################################
# Table S13. DiD Regression Results: Placebo Outcomes
################################################################################
# North Korea 
# Columns (1): OLS
did.nk.ols.f <- felm(koreathreat ~ anticorrupt * log_china_undergraduate +
                          age + education + white + income + pid5 + ideology +
                           anticorrupt * median_household_income_zcta +
                           anticorrupt * ln_pop_density_zcta +
                           anticorrupt * share_enroll_zcta
                       | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                       data = df)
summary(did.nk.ols.f)

# Column (2): Ordered Logit
did.nk.ologit.f <- lrm(koreathreat ~ anticorrupt * log_china_undergraduate +
                         age + education + white + income + pid5 + ideology +
                         anticorrupt * median_household_income_zcta +
                         anticorrupt * ln_pop_density_zcta +
                         anticorrupt * share_enroll_zcta,
                         x = TRUE, y = TRUE,
                         data = df)
print(did.nk.ologit.f)

# clustered SE
did.nk.ologit.f.cse <- robcov(did.nk.ologit.f, df$zcta) #robust errors
print(did.nk.ologit.f.cse)

# Russia
# Column (3): OLS
did.ru.ols.f <- felm(russiathreat ~ anticorrupt * log_china_undergraduate +
                         age + education + white + income + pid5 + ideology +
                         anticorrupt * median_household_income_zcta +
                         anticorrupt * ln_pop_density_zcta +
                         anticorrupt * share_enroll_zcta
                        | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                        data = df)
summary(did.ru.ols.f)

# Column (4): Ordered Logit
did.ru.ologit.f <- lrm(russiathreat ~ anticorrupt * log_china_undergraduate +
                          age + education + white + income + pid5 + ideology +
                           anticorrupt * median_household_income_zcta +
                           anticorrupt * ln_pop_density_zcta +
                           anticorrupt * share_enroll_zcta,
                         x = TRUE, y = TRUE,
                         data = df)
print(did.ru.ologit.f)

# clustered SE
did.ru.ologit.f.cse <- robcov(did.ru.ologit.f, df$zcta) #robust errors
print(did.ru.ologit.f.cse)

# Iran
# Column (5): OLS
did.ir.ols.f <- felm(iranthreat ~ anticorrupt * log_china_undergraduate +
                          age + education + white + income + pid5 + ideology +
                          anticorrupt * median_household_income_zcta +
                          anticorrupt * ln_pop_density_zcta +
                          anticorrupt * share_enroll_zcta
                         | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                         data = df)
summary(did.ir.ols.f)

# Column (6): Ordered Logit
did.ir.ologit.f <- lrm(iranthreat ~ anticorrupt * log_china_undergraduate +
                         age + education + white + income + pid5 + ideology +
                         anticorrupt * median_household_income_zcta +
                         anticorrupt * ln_pop_density_zcta +
                         anticorrupt * share_enroll_zcta,
                         x = TRUE, y = TRUE,
                         data = df)
print(did.ir.ologit.f)

# clustered SE
did.ir.ologit.f.cse <- robcov(did.ir.ologit.f, df$zcta) #robust errors
print(did.ir.ologit.f.cse)

# make variable names consistent across models
# North Korea
names(did.nk.ologit.f.cse$coefficients)[names(did.nk.ologit.f.cse$coefficients) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
names(did.nk.ologit.f.cse$coefficients)[names(did.nk.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.nk.ologit.f.cse$coefficients)[names(did.nk.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.nk.ologit.f.cse$coefficients)[names(did.nk.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.nk.ologit.f.cse$var)[rownames(did.nk.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
colnames(did.nk.ologit.f.cse$var)[colnames(did.nk.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
rownames(did.nk.ologit.f.cse$var)[rownames(did.nk.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.nk.ologit.f.cse$var)[colnames(did.nk.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.nk.ologit.f.cse$var)[rownames(did.nk.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.nk.ologit.f.cse$var)[colnames(did.nk.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.nk.ologit.f.cse$var)[rownames(did.nk.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.nk.ologit.f.cse$var)[colnames(did.nk.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"

# Russia
names(did.ru.ologit.f.cse$coefficients)[names(did.ru.ologit.f.cse$coefficients) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
names(did.ru.ologit.f.cse$coefficients)[names(did.ru.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.ru.ologit.f.cse$coefficients)[names(did.ru.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.ru.ologit.f.cse$coefficients)[names(did.ru.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.ru.ologit.f.cse$var)[rownames(did.ru.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
colnames(did.ru.ologit.f.cse$var)[colnames(did.ru.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
rownames(did.ru.ologit.f.cse$var)[rownames(did.ru.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.ru.ologit.f.cse$var)[colnames(did.ru.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.ru.ologit.f.cse$var)[rownames(did.ru.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.ru.ologit.f.cse$var)[colnames(did.ru.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.ru.ologit.f.cse$var)[rownames(did.ru.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.ru.ologit.f.cse$var)[colnames(did.ru.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"


# Iran
names(did.ir.ologit.f.cse$coefficients)[names(did.ir.ologit.f.cse$coefficients) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
names(did.ir.ologit.f.cse$coefficients)[names(did.ir.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.ir.ologit.f.cse$coefficients)[names(did.ir.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.ir.ologit.f.cse$coefficients)[names(did.ir.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.ir.ologit.f.cse$var)[rownames(did.ir.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
colnames(did.ir.ologit.f.cse$var)[colnames(did.ir.ologit.f.cse$var) == "anticorrupt * log_china_undergraduate"] <- "anticorrupt:log_china_undergraduate"
rownames(did.ir.ologit.f.cse$var)[rownames(did.ir.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.ir.ologit.f.cse$var)[colnames(did.ir.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.ir.ologit.f.cse$var)[rownames(did.ir.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.ir.ologit.f.cse$var)[colnames(did.ir.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.ir.ologit.f.cse$var)[rownames(did.ir.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.ir.ologit.f.cse$var)[colnames(did.ir.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"

# set outputs
did.p.o <- list(did.nk.ols.f,
                did.nk.ologit.f.cse,
                did.ru.ols.f,
                did.ru.ologit.f.cse,
                did.ir.ols.f,
                did.ir.ologit.f.cse)

# set p-values
# nk
vv.nk <- diag(did.nk.ologit.f.cse$var)
cof.nk <- did.nk.ologit.f.cse$coef
z.nk <- cof.nk/sqrt(vv.nk)
p.nk <- 1 - pchisq(z.nk^2, 1)

# ru
vv.ru <- diag(did.ru.ologit.f.cse$var)
cof.ru <- did.ru.ologit.f.cse$coef
z.ru <- cof.ru/sqrt(vv.ru)
p.ru <- 1 - pchisq(z.ru^2, 1)

# ir
vv.ir <- diag(did.ir.ologit.f.cse$var)
cof.ir <- did.ir.ologit.f.cse$coef
z.ir <- cof.ir/sqrt(vv.ir)
p.ir <- 1 - pchisq(z.ir^2, 1)

pvalue.p.o <- list(round(did.nk.ols.f$cpval, 3),
                   round(p.nk, 3),
                   round(did.ru.ols.f$cpval, 3),
                   round(p.ru, 3),
                   round(did.ir.ols.f$cpval, 3),
                   round(p.ir, 3))

# set CIs
ci.p.o <- list(confint(did.nk.ols.f),
               confint.default(did.nk.ologit.f.cse),
               confint(did.ru.ols.f),
               confint.default(did.ru.ologit.f.cse),
               confint(did.ir.ols.f),
               confint.default(did.ir.ologit.f.cse))

# set model names
mod.names.p.o <- c("OLS: PRK", 
                   "Ologit: PRK", 
                   "OLS: RUS",
                   "Ologit: RUS",
                   "OLS: IRN",
                   "Ologit: IRN")

# save
# NOTE that a bug in stargazer causes it to use default p-values instead of 
# replacing them with the p-values we have calculated above. Results are 
# usually the same up to two decimal points. When there is a slight difference, 
# we report in the SI the p-values calculated above.
sink(file.path(TABLE_DIR, "Table-S13-pew-placebo-dv.txt"))
#sink(file.path(TABLE_DIR, "Table-S13-pew-placebo-dv.tex"))
stargazer(did.p.o,
          p = pvalue.p.o, # does not seem to work under report = ("vcsp"), corrected in SI
          p.auto = FALSE,
          t.auto = FALSE,
          ci = TRUE,
          ci.custom = ci.p.o,
          report = ("vcsp"),
          digits = 3, 
          #type = "latex",
          type = "text",
          title = "DiD Regression Results: Placebo Outcomes",
          dep.var.labels = "Perceived Threat from Placebo Countries",
          dep.var.labels.include = TRUE,
          model.names = FALSE,
          column.labels = mod.names.p.o,
          order = var.order,
          covariate.labels = replaceVarName(var.vec = var.label,
                                            var.df = var.df),
          label = "tb:did-pew-placebo-dv",
          single.row = FALSE,
          df = FALSE,
          font.size = "tiny",
          add.lines = list(c("Number of ZIP Codes", 
                             formatC(length(levels(did.nk.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(did.nk.ologit.f.cse$clusterInfo$n, big.mark = ","),
                             formatC(length(levels(did.ru.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(did.ru.ologit.f.cse$clusterInfo$n, big.mark = ","),
                             formatC(length(levels(did.ir.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(did.ir.ologit.f.cse$clusterInfo$n, big.mark = ","))
          ),
          star.cutoffs = c(0.1, 0.05, 0.01),
          star.char = c("*", "**", "***"),
          #notes = c("Robust standard errors clustered by ZIP code in parentheses. $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01. The drop in observations for Russia is due to the Pew survey not asking questions regarding perceived Russian threat in October 2013."),
          notes = c("95\\% confidence intervals and p-values are presented. Calculations are based on robust standard errors clustered by ZIP code. The drop in observations for Russia is due to the Pew survey not asking questions regarding perceived Russian threat in October 2013."),
          notes.align = "l",
          notes.append = FALSE)
sink()


################################################################################
# Table S14. DiD Regression Results: Placebo Student Treatments
################################################################################
# China graduates
# Column (1): OLS
did.cn.grad.ols.f <- felm(cnthreat ~ anticorrupt * log_china_graduate +
                            age + education + white + income + pid5 + ideology +
                            anticorrupt * median_household_income_zcta +
                            anticorrupt * ln_pop_density_zcta +
                            anticorrupt * share_enroll_zcta
                          | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                          data = df)
summary(did.cn.grad.ols.f)
# summary(glht(did.cn.grad.ols.f, 
#              linfct = c("log_china_graduate + anticorrupt:log_china_graduate = 0")))

# Column (2): Ordered Logit
did.cn.grad.ologit.f <- lrm(cnthreat ~ anticorrupt * log_china_graduate +
                              age + education + white + income + pid5 + ideology +
                              anticorrupt * median_household_income_zcta +
                              anticorrupt * ln_pop_density_zcta +
                              anticorrupt * share_enroll_zcta,
                            x = TRUE, y = TRUE,
                            data = df)
print(did.cn.grad.ologit.f)

# clustered SE
did.cn.grad.ologit.f.cse <- robcov(did.cn.grad.ologit.f, df$zcta) #robust errors
print(did.cn.grad.ologit.f.cse)

# India undergraduates
# Column (3): OLS full
did.ind.ols.f <- felm(cnthreat ~ anticorrupt * log_india_undergraduate +
                          age + education + white + income + pid5 + ideology +
                          anticorrupt * median_household_income_zcta +
                          anticorrupt * ln_pop_density_zcta +
                          anticorrupt * share_enroll_zcta
                       | 0 | 0 | zcta, # FEs | IVs | Clustered SEs
                       data = df)
summary(did.ind.ols.f)

# Column (4): Ordered Logit
did.ind.ologit.f <- lrm(cnthreat ~ anticorrupt * log_india_undergraduate +
                          age + education + white + income + pid5 + ideology +
                          anticorrupt * median_household_income_zcta +
                          anticorrupt * ln_pop_density_zcta +
                          anticorrupt * share_enroll_zcta,
                        x = TRUE, y = TRUE,
                        data = df)
print(did.ind.ologit.f)

# clustered SE
did.ind.ologit.f.cse <- robcov(did.ind.ologit.f, df$zcta) #robust errors
print(did.ind.ologit.f.cse)

# make variable names consistent across models
# CHN grads
names(did.cn.grad.ologit.f.cse$coefficients)[names(did.cn.grad.ologit.f.cse$coefficients) == "anticorrupt * log_china_graduate"] <- "anticorrupt:log_china_graduate"
names(did.cn.grad.ologit.f.cse$coefficients)[names(did.cn.grad.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.cn.grad.ologit.f.cse$coefficients)[names(did.cn.grad.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.cn.grad.ologit.f.cse$coefficients)[names(did.cn.grad.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.cn.grad.ologit.f.cse$var)[rownames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * log_china_graduate"] <- "anticorrupt:log_china_graduate"
colnames(did.cn.grad.ologit.f.cse$var)[colnames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * log_china_graduate"] <- "anticorrupt:log_china_graduate"
rownames(did.cn.grad.ologit.f.cse$var)[rownames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.cn.grad.ologit.f.cse$var)[colnames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.cn.grad.ologit.f.cse$var)[rownames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.cn.grad.ologit.f.cse$var)[colnames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.cn.grad.ologit.f.cse$var)[rownames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.cn.grad.ologit.f.cse$var)[colnames(did.cn.grad.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"

# IND Undergrads
names(did.ind.ologit.f.cse$coefficients)[names(did.ind.ologit.f.cse$coefficients) == "anticorrupt * log_india_undergraduate"] <- "anticorrupt:log_india_undergraduate"
names(did.ind.ologit.f.cse$coefficients)[names(did.ind.ologit.f.cse$coefficients) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
names(did.ind.ologit.f.cse$coefficients)[names(did.ind.ologit.f.cse$coefficients) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
names(did.ind.ologit.f.cse$coefficients)[names(did.ind.ologit.f.cse$coefficients) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
rownames(did.ind.ologit.f.cse$var)[rownames(did.ind.ologit.f.cse$var) == "anticorrupt * log_india_undergraduate"] <- "anticorrupt:log_india_undergraduate"
colnames(did.ind.ologit.f.cse$var)[colnames(did.ind.ologit.f.cse$var) == "anticorrupt * log_india_undergraduate"] <- "anticorrupt:log_india_undergraduate"
rownames(did.ind.ologit.f.cse$var)[rownames(did.ind.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
colnames(did.ind.ologit.f.cse$var)[colnames(did.ind.ologit.f.cse$var) == "anticorrupt * median_household_income_zcta"] <- "anticorrupt:median_household_income_zcta"
rownames(did.ind.ologit.f.cse$var)[rownames(did.ind.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
colnames(did.ind.ologit.f.cse$var)[colnames(did.ind.ologit.f.cse$var) == "anticorrupt * ln_pop_density_zcta"] <- "anticorrupt:ln_pop_density_zcta"
rownames(did.ind.ologit.f.cse$var)[rownames(did.ind.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"
colnames(did.ind.ologit.f.cse$var)[colnames(did.ind.ologit.f.cse$var) == "anticorrupt * share_enroll_zcta"] <- "anticorrupt:share_enroll_zcta"

# set outputs
did.p.t <- list(did.cn.grad.ols.f,
                did.cn.grad.ologit.f.cse,
                did.ind.ols.f,
                did.ind.ologit.f.cse)

# set p-values
# CN grads
vv.cn.grad <- diag(did.cn.grad.ologit.f.cse$var)
cof.cn.grad <- did.cn.grad.ologit.f.cse$coef
z.cn.grad <- cof.cn.grad/sqrt(vv.cn.grad)
p.cn.grad <- 1 - pchisq(z.cn.grad^2, 1)

# IN undergrads
vv.ind.under <- diag(did.ind.ologit.f.cse$var)
cof.ind.under <- did.ind.ologit.f.cse$coef
z.ind.under <- cof.ind.under/sqrt(vv.ind.under)
p.ind.under <- 1 - pchisq(z.ind.under^2, 1)

pvalue.p.t <- list(round(did.cn.grad.ols.f$cpval, 3),
                   round(p.cn.grad, 3),
                   round(did.ind.ols.f$cpval, 3),
                   round(p.ind.under, 3))

# set CIs
ci.p.t <- list(confint(did.cn.grad.ols.f),
               confint.default(did.cn.grad.ologit.f.cse),
               confint(did.ind.ols.f),
               confint.default(did.ind.ologit.f.cse))

# set model names
mod.names.p.t <- c("OLS: CHN Grads", 
                   "Ologit: CHN Grads", 
                   "OLS: IND Undergrads",
                   "Ologit: IND Undergrads")

# set var order
var.order.p.t <- c("^anticorrupt$",
                   "^log_china_graduate$",
                   "^anticorrupt:log_china_graduate$",
                   "^log_india_undergraduate$",
                   "^anticorrupt:log_india_undergraduate$",
                   
                   # individual-level controls
                   "^age$",
                   "^education$",
                   "^white$",
                   "^income$",
                   "^pid5$",
                   "^ideology$",
                   
                   # zcta controls
                   "^median_household_income_zcta$",
                   "^anticorrupt:median_household_income_zcta$",
                   
                   "^ln_pop_density_zcta$",
                   "^anticorrupt:ln_pop_density_zcta$",
                   
                   "^share_enroll_zcta$",
                   "^anticorrupt:share_enroll_zcta$",
                   "^y>=2$",
                   "^y>=3$")

# set var label
var.label.p.t <- str_replace_all(var.order.p.t, "\\^", "")
var.label.p.t <- str_replace_all(var.label.p.t, "\\$", "")

# save
# NOTE that a bug in stargazer causes it to use default p-values instead of 
# replacing them with the p-values we have calculated above. Results are 
# usually the same up to two decimal points. When there is a slight difference, 
# we report in the SI the p-values calculated above.
sink(file.path(TABLE_DIR, "Table-S14-pew-placebo-treat.txt"))
#sink(file.path(TABLE_DIR, "Table-S14-pew-placebo-treat.tex"))
stargazer(did.p.t,
          p = pvalue.p.t, # does not seem to work under report = ("vcsp"), corrected in SI
          p.auto = FALSE,
          t.auto = FALSE,
          ci = TRUE,
          ci.custom = ci.p.t,
          report = ("vcsp"),
          digits = 3, 
          #type = "latex",
          type = "text",
          title = "DiD Regression Results: Placebo Student Treatments",
          dep.var.labels = "Perceived China Threat",
          dep.var.labels.include = TRUE,
          model.names = FALSE,
          column.labels = mod.names.p.t,
          order = var.order.p.t,
          covariate.labels = replaceVarName(var.vec = var.label.p.t,
                                            var.df = var.df),
          label = "tb:did-pew-placebo-treat",
          single.row = FALSE,
          df = FALSE,
          font.size = "tiny",
          add.lines = list(c("Number of ZIP Codes", 
                             formatC(length(levels(did.cn.grad.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(did.cn.grad.ologit.f.cse$clusterInfo$n, big.mark = ","),
                             formatC(length(levels(did.ind.ols.f$clustervar$zcta)), big.mark = ","),
                             formatC(did.ind.ologit.f.cse$clusterInfo$n, big.mark = ","))
          ),
          star.cutoffs = c(0.1, 0.05, 0.01),
          star.char = c("*", "**", "***"),
          #notes = c("Robust standard errors clustered by ZIP code in parentheses. $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
          notes = c("95\\% confidence intervals and p-values are presented. Calculations are based on robust standard errors clustered by ZIP code."),
          notes.align = "l",
          notes.append = FALSE)
sink()


################################################################################
## Figure 1b
################################################################################
# set alpha level
alpha <- 0.05
y.limits <- c(-0.1, 0.1)

# extract coefs and clustered SEs
coef.df.period <- tibble(period = c("Post-shock - Pre-shock"),
                         model = c("Base", "Extended", "Full"),
                         coef = c(coef(did.cn.ols.b)["anticorrupt:log_china_undergraduate"],
                                  coef(did.cn.ols.i)["anticorrupt:log_china_undergraduate"],
                                  coef(did.cn.ols.f)["anticorrupt:log_china_undergraduate"]),
                         cse = c(did.cn.ols.b$cse["anticorrupt:log_china_undergraduate"],
                                 did.cn.ols.i$cse["anticorrupt:log_china_undergraduate"],
                                 did.cn.ols.f$cse["anticorrupt:log_china_undergraduate"]))

# calculate C.I.
coef.df.period <- coef.df.period %>%
  mutate(multiplier = qnorm(1 - alpha/2),
         lwr = coef - multiplier * cse,
         upr = coef + multiplier * cse,
         period = factor(period, levels = c("Post-shock - Pre-shock")),
         model = factor(model, levels = c("Base", "Extended", "Full")))

coef.df.period

# df for annotations
txt.shift <- 0.01
txt.pos <- tibble(period = coef.df.period$period,
                  model = coef.df.period$model,
                  coef = c(coef.df.period[1,]$lwr - txt.shift,
                           coef.df.period[2,]$lwr - txt.shift,
                           coef.df.period[3,]$lwr - txt.shift),
                  lwr = coef.df.period$lwr,
                  upr = coef.df.period$upr)

# plot
axis.title.size <- 16

p.coef.main <- ggplot(data = coef.df.period,
                            aes(x = model, y = coef,
                                ymin = lwr, 
                                ymax = upr)) +
  geom_pointrange(size = 0.8) +
  geom_hline(yintercept = 0,
             color = I(hsv(0/12, 7/12, 7/12)), 
             linetype = "dashed") +
  geom_text(data = txt.pos,
            aes(label = model),
            size = axis.title.size - 11) +
  scale_x_discrete("Perceived Threat China") +
  scale_y_continuous("Post-Shock Diff. in Effect of CHN Int'l UG Students (log)",
                     limits = y.limits) +
  theme_bw() +  
  ggtitle("Pew Cross-Sectional Data\n2008-2015 Perceived China Threat") +
  theme(legend.position = "none",
        plot.title = element_text(size = axis.title.size + 4,
                                  face = "bold",
                                  hjust = 0.5,
                                  margin = margin(0, 0, 20, 0)),
        plot.margin = margin(0.1, 0.5, 0.1, 0.1, "cm"),
        axis.title.y = element_text(size = axis.title.size - 2,
                                    margin = margin(0, 20, 0, 0)),
        axis.text.y = element_text(size = axis.title.size),
        strip.background = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        strip.text = element_text(size = axis.title.size + 4,
                                  #face = "bold",
                                  margin = margin(20, 0, 20, 0)),
        panel.grid = element_blank()) 

p.coef.main

# save
pdf(file = paste(FIG_DIR, "Figure-1b.pdf", sep = "/"),
    width = 6.5, height = 5.8)
print(p.coef.main)
dev.off()

# ggsave(file = paste(FIG_DIR, "Figure-1b.eps", sep = "/"),
#        width = 6.5, height = 5.8)


################################################################################
# simulate effects: CNUG, predicted values, continuous predictor, original scale
################################################################################
# set key var range
key.var.length <- 50

# 2 sd above/below mean
range.key <- seq(ifelse((mean(df$log_china_undergraduate, na.rm = TRUE) -
                           2*sd(df$log_china_undergraduate, na.rm = TRUE)) < 0,
                        min(df$log_china_undergraduate, na.rm = TRUE)),
                 mean(df$log_china_undergraduate, na.rm = TRUE) +
                   2*sd(df$log_china_undergraduate, na.rm = TRUE),
                 length.out = key.var.length)
range.key

# range of moderator
range.mod <- sort(unique(df$anticorrupt))
range.mod

# set parameters
sim.df <- df
fm <- did.cn.ols.f
change.var <- "log_china_undergraduate"
moderator <- "anticorrupt"
interaction <- "anticorrupt:log_china_undergraduate"
range.key <- range.key
range.mod <- range.mod
n.draws <- 1000
dv.log <- FALSE
range.log <- TRUE

if (first.time){
  
  # set seed for simulation
  seed <- 1234
  set.seed(seed)
  
  # load library to draw from multivariate normal
  library(MASS)
  beta.draws <- mvrnorm(n.draws, coef(fm), fm$clustervcv)
  
  # extract model matrix
  model.df.full <- as.data.frame(fm$X)
  
  # for each value of change var
  sim.out.cn <- map_df(1:length(range.key), function(x){
    
    sim.out.sub <- map_df(1:length(range.mod), function(y){
      # print
      cat(paste("Simulating predicted probabilities for ", x, " of ",
                length(range.key), " ", change.var, " values when ", moderator,
                " = ", range.mod[[y]],
                "\n", sep = ""))
      
      # create temp data
      newd <- model.df.full
      
      if(range.log){
        # replace change var value x
        newd[,change.var] <- range.key[[x]]
        
        # replace moderator value y
        newd[,moderator] <- range.mod[[y]]
        
        # calculate new interaction
        newd[,interaction] <- range.key[[x]] * range.mod[[y]]
        
      } else{
        # replace change var value x
        newd[,change.var] <- log(range.key[[x]] + 1)
        
        # replace moderator value y
        newd[,moderator] <- range.mod[[y]]
        
        # calculate new interaction
        newd[,interaction] <- log(range.key[[x]] + 1) * range.mod[[y]]
        
      }
      
      # matrix multiplication for systematic component
      sys.com <- as.matrix(newd) %*% t(beta.draws)
      
      # average over observations for each draw
      ev <- as.data.frame(sys.com) %>%
        summarise_all(funs(mean))
      
      if(dv.log){
        # convert to original scale of outcome
        ev <- exp(ev) %>%
          mutate(mod = range.mod[[y]])
        
        return(ev)
        
      } else{
        ev <- ev %>%
          mutate(mod = range.mod[[y]])
        
        return(ev)
      }
    })
    
    # add set identifier
    sim.out <- sim.out.sub %>%
      mutate(setx = range.key[[x]])
    
    return(sim.out)
  })
  
  # save data sets
  save(sim.out.cn, file = paste(RESULTS, "pew-sim-out-cn-ug.RData", sep = ""))
  
} else {
  
  # load 
  load(file = paste(RESULTS, "pew-sim-out-cn-ug.RData", sep = ""))
  
}


################################################################################
# simulate effects: CNUG, 1 s.d. below mean to above, original scale 
# (for discussion in main text)
################################################################################
range.key <- c(0, # 1 s.d. below mean is negative
               mean(df$log_china_undergraduate, na.rm = TRUE) + 
                 sd(df$log_china_undergraduate, na.rm = TRUE))
range.key

# exposure on the scale of students
exp(range.key)

# range of moderator
range.mod <- sort(unique(df$anticorrupt))
range.mod

# set parameters
sim.df <- df
fm <- did.cn.ols.f
change.var <- "log_china_undergraduate"
moderator <- "anticorrupt"
interaction <- "anticorrupt:log_china_undergraduate"
range.key <- range.key
range.mod <- range.mod
n.draws <- 1000
dv.log <- FALSE
range.log <- TRUE

if (first.time){ 
  
  # set seed for simulation
  seed <- 1234
  set.seed(seed)
  
  # load library to draw from multivariate normal
  library(MASS)
  beta.draws <- mvrnorm(n.draws, coef(fm), fm$clustervcv)
  
  # extract model matrix
  model.df.full <- as.data.frame(fm$X)
  
  # for each value of change var
  sim.out.cn.fd <- map_df(1:length(range.key), function(x){
    
    sim.out.sub <- map_df(1:length(range.mod), function(y){
      # print
      cat(paste("Simulating predicted probabilities for ", x, " of ",
                length(range.key), " ", change.var, " values when ", moderator,
                " = ", range.mod[[y]],
                "\n", sep = ""))
      
      # create temp data
      newd <- model.df.full
      
      if(range.log){
        # replace change var value x
        newd[,change.var] <- range.key[[x]]
        
        # replace moderator value y
        newd[,moderator] <- range.mod[[y]]
        
        # calculate new interaction
        newd[,interaction] <- range.key[[x]]*range.mod[[y]]
        
      } else{
        # replace change var value x
        newd[,change.var] <- log(range.key[[x]] + 1)
        
        # replace moderator value y
        newd[,moderator] <- range.mod[[y]]
        
        # calculate new interaction
        newd[,interaction] <- log(range.key[[x]] + 1) * range.mod[[y]]
        
      }
      
      # matrix multiplication for systematic component
      sys.com <- as.matrix(newd) %*% t(beta.draws)
      
      # average over observations for each draw
      ev <- as.data.frame(sys.com) %>%
        summarise_all(funs(mean))
      
      if(dv.log){
        # convert to original scale of outcome
        ev <- exp(ev) %>%
          mutate(mod = range.mod[[y]])
        
        return(ev)
        
      } else{
        ev <- ev %>%
          mutate(mod = range.mod[[y]])
        
        return(ev)
      }
    })
    
    # add set identifier
    sim.out <- sim.out.sub %>%
      mutate(setx = range.key[[x]])
    
    return(sim.out)
  })
  
  # save data sets
  save(sim.out.cn.fd, file = paste(RESULTS, "pew-sim-out-cn-ug-fd.RData", sep = ""))
  
} else {
  
  # load
  load(file = paste(RESULTS, "pew-sim-out-cn-ug-fd.RData", sep = ""))
  
}

# calculate pv for each setx and moderator value
ev.cn.fd <- sim.out.cn.fd %>%
  gather(draw, value, -setx, -mod) %>%
  group_by(setx, mod) %>%
  dplyr::summarize(mean = mean(value, na.rm = TRUE),
                   lwr = quantile(value, probs = 0.025),
                   upr = quantile(value, probs = 0.975)) %>%
  ungroup() %>%
  mutate(Stage = ifelse(mod == 0, "Pre-anticorruption", "Post-anticorruption"),
         Stage = factor(Stage, levels = c("Pre-anticorruption", "Post-anticorruption")))

ev.cn.fd

# calculate first differences (dif before and after shock)
fd.out <- sim.out.cn.fd %>%
  gather(draw, value, -setx, -mod) %>%
  spread(setx, value) %>%
  rename(one_sd_under = 3, 
         one_sd_above = 4) %>%
  mutate(fd = one_sd_above - one_sd_under) %>%
  group_by(mod) %>%
  dplyr::summarize(mean = mean(fd, na.rm = TRUE),
                   lwr = quantile(fd, probs = 0.025),
                   upr = quantile(fd, probs = 0.975)) %>%
  ungroup()

fd.out

# as % of variation in outcome: -0.04
fd.out$mean[2]/(max(df$cnthreat, na.rm = TRUE) - min(df$cnthreat, na.rm = TRUE))

# as s.d.: -0.11
fd.out$mean[2]/sd(df$cnthreat, na.rm = TRUE)


################################################################################
# contrast with the effect of education
################################################################################
# 1 sd above/below mean
range.key <- c(mean(df$education, na.rm = TRUE) -
                   1 * sd(df$education, na.rm = TRUE),
                 mean(df$education, na.rm = TRUE) +
                   1 * sd(df$education, na.rm = TRUE))
range.key

# set parameters
sim.df <- df
fm <- did.cn.ols.f
change.var <- "education"
range.key <- range.key
n.draws <- 1000
dv.log <- FALSE
range.log <- FALSE

# set seed for simulation
seed <- 1234
set.seed(seed)

# load library to draw from multivariate normal
library(MASS)
beta.draws <- mvrnorm(n.draws, coef(fm), fm$clustervcv)

# extract model matrix
model.df.full <- as.data.frame(fm$X)

# for each value of change var
sim.out.edu <- map_df(1:length(range.key), function(x){
  
    # print
    cat(paste("Simulating predicted probabilities for ", x, " of ",
              length(range.key), " ", change.var, " values\n", sep = ""))
    
    # create temp data
    newd <- model.df.full
    
    if(range.log){
      
      # replace change var value x
      newd[,change.var] <- range.key[[x]]
      
    } else{
      
      # replace change var value x
      newd[,change.var] <- range.key[[x]]
      
    }
    
    # matrix multiplication for systematic component
    sys.com <- as.matrix(newd) %*% t(beta.draws)
    
    # average over observations for each draw
    ev <- as.data.frame(sys.com) %>%
      summarise_all(funs(mean))
    
    if(dv.log){
      
      # convert to original scale of outcome
      ev <- exp(ev) 
      
    } else{
      
      ev <- ev 
      
    }
  
  # add set identifier
  sim.out <- ev %>%
    mutate(setx = range.key[[x]])
  
  return(sim.out)
})

# calculate pv for each setx
ev.edu <- sim.out.edu %>%
  gather(draw, value, -setx) %>%
  group_by(setx) %>%
  dplyr::summarize(mean = mean(value, na.rm = TRUE),
                   lwr = quantile(value, probs = 0.025),
                   upr = quantile(value, probs = 0.975)) %>%
  ungroup()

ev.edu

# effect of increasing education from 1 s.d. below mean to above: -0.09
round(ev.edu$mean[2] - ev.edu$mean[1], 2)

# compare effect of exposure: 0.833
round(fd.out$mean[2], 3)/round(ev.edu$mean[2] - ev.edu$mean[1], 2)


################################################################################
## Figure 3. Predicted Perception of China Threat
################################################################################
# set parameters
sim.df <- df
fm <- did.cn.ols.f
key.var.length <- 50

range.key <- seq(ifelse((mean(df$log_china_undergraduate, na.rm = TRUE) -
                           2 * sd(df$log_china_undergraduate, na.rm = TRUE)) < 0,
                        min(df$log_china_undergraduate, na.rm = TRUE)),
                 mean(df$log_china_undergraduate, na.rm = TRUE) +
                   2 * sd(df$log_china_undergraduate, na.rm = TRUE),
                 length.out = key.var.length)
range.key

# calculate pv for each setx and moderator value
ev <- sim.out.cn %>%
  gather(draw, value, -setx, -mod) %>%
  group_by(setx, mod) %>%
  dplyr::summarize(mean = mean(value, na.rm = TRUE),
            lwr = quantile(value, probs = 0.025),
            upr = quantile(value, probs = 0.975)) %>%
  ungroup() %>%
  mutate(Stage = ifelse(mod == 0, "Pre-anticorruption", "Post-anticorruption"),
         Stage = factor(Stage, levels = c("Pre-anticorruption", "Post-anticorruption")))

# calculate first difference (dif before and after shock)
fd.cn <- sim.out.cn %>%
  gather(draw, value, -setx, -mod) %>%
  spread(mod, value) %>%
  mutate(fd = `1` - `0`) %>%
  group_by(setx) %>%
  dplyr::summarize(mean = mean(fd, na.rm = TRUE),
                   lwr = quantile(fd, probs = 0.025),
                   upr = quantile(fd, probs = 0.975)
                   ) %>%
  ungroup()

# get distribution data
dist.fm.cn <- sim.df[-fm$na.action,]

# clean distribution data
dist.fm.cn <- dist.fm.cn %>%
  filter(log_china_undergraduate <= max(range.key)) %>%
  dplyr::select(cnthreat, log_china_undergraduate, anticorrupt) %>%
  dplyr::rename(setx = log_china_undergraduate)

# set parameters
axis.title.size <- 14
ribbon.alpha <- 0.1

# plot
text.p.h <- 0.05
#text.p.v <- 2.21
text.p.v <- 1

p.ev <- ggplot(ev,
               aes(x = setx,
                   y = mean)) +
  geom_ribbon(aes(ymin = lwr,
                  ymax = upr),
              alpha = ribbon.alpha) +
  geom_line(size = 0.4) +
  geom_vline(xintercept = mean(sim.df$log_china_undergraduate, na.rm = TRUE) +
               sd(sim.df$log_china_undergraduate, na.rm = TRUE),
             size = 0.4,
             color = I(hsv(0/12, 7/12, 7/12)),
             linetype = "dashed") +
  geom_vline(xintercept = mean(sim.df$log_china_undergraduate, na.rm = TRUE),
             size = 0.4,
             color = I(hsv(0/12, 7/12, 7/12)),
             linetype = "dashed") +
  facet_wrap(~ Stage, nrow = 1) +
  scale_x_continuous("Chinese International Undergraduate Students Population (log)") +
  scale_y_continuous("Predicted Perceived China Threat (Scale = 1-3)",
                     limits = c(1, 3)) +
  theme_bw() +
  theme(plot.title = element_text(size = axis.title.size + 4,
                                  face = "bold",
                                  margin = margin(0, 0, 20, 0)),
        axis.title.x = element_text(size = axis.title.size - 2,
                                    margin = margin(20, 0, 0, 0)),
        axis.title.y = element_text(size = axis.title.size - 2,
                                    margin = margin(0, 20, 0, 0)),
        strip.text = element_text(size = axis.title.size + 2,
                                  face = "bold"),
        strip.background = element_blank(),
        panel.grid = element_blank(),
        legend.justification = "top") +
  annotate("text",
           x = mean(sim.df$log_china_undergraduate, na.rm = TRUE) +
             sd(sim.df$log_china_undergraduate, na.rm = TRUE) + text.p.h,
           y = text.p.v, label = "+1 s.d.",
           hjust = 0) +
  annotate("text",
           x = mean(sim.df$log_china_undergraduate, na.rm = TRUE) + text.p.h,
           y = text.p.v, label = "mean",
           hjust = 0)

p.ev

pdf(file = paste(FIG_DIR, "Figure-3.pdf", sep = "/"),
    width = 5.5, height = 5)
print(p.ev)
dev.off()

# ggsave(file = paste(FIG_DIR, "Figure-3.eps", sep = "/"), p.ev,
#        width = 9, height = 5, device = cairo_ps)

